{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "441041e9-9eee-46a2-9880-abc30a63c172",
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai,os\n",
    "from dotenv import load_dotenv\n",
    "from llama_index.llms import OpenAI\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "openai_api_key=os.getenv('OPENAI_API_KEY')\n",
    "\n",
    "openai.api_key = openai_api_key\n",
    "llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b1a77546-10f5-43ed-ad67-ffae54679e81",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import VectorStoreIndex, SimpleDirectoryReader, Document\n",
    "\n",
    "reader = SimpleDirectoryReader(input_files=['./pdf_files/sample2.pdf'])\n",
    "docs = reader.load_data()\n",
    "\n",
    "document = Document(text=\"\\n\\n\".join([doc.text for doc in docs]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "90a05c60-d502-4dab-9c88-860e54bed026",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.vector_stores import ElasticsearchStore\n",
    "\n",
    "vector_store = ElasticsearchStore(\n",
    "    es_url=\"http://localhost:9200\",\n",
    "    index_name=\"books\"  # If this index doesn't exist, a new one is created\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7f41e998-e70c-4283-ab9e-23d4330969a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import ServiceContext, VectorStoreIndex, StorageContext\n",
    "from llama_index.node_parser import SentenceWindowNodeParser\n",
    "from llama_index.indices.postprocessor import MetadataReplacementPostProcessor\n",
    "from llama_index.indices.postprocessor import SentenceTransformerRerank\n",
    "\n",
    "\n",
    "def build_sentence_window_index(\n",
    "    document, llm, vector_store, embed_model=\"local:BAAI/bge-small-en-v1.5\"\n",
    "):\n",
    "    node_parser = SentenceWindowNodeParser.from_defaults(\n",
    "        window_size=3,\n",
    "        window_metadata_key=\"window\",\n",
    "        original_text_metadata_key=\"original_text\",\n",
    "    )\n",
    "    sentence_context = ServiceContext.from_defaults(\n",
    "        llm=llm,\n",
    "        embed_model=embed_model,\n",
    "        node_parser=node_parser\n",
    "    )\n",
    "    storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "    sentence_index = VectorStoreIndex.from_documents(\n",
    "        [document], service_context=sentence_context, storage_context=storage_context\n",
    "    )\n",
    "\n",
    "    return sentence_index\n",
    "\n",
    "def get_sentence_window_query_engine(\n",
    "    sentence_index,\n",
    "    similarity_top_k=6,\n",
    "    rerank_top_n=2,\n",
    "):\n",
    "    postproc = MetadataReplacementPostProcessor(target_metadata_key=\"window\")\n",
    "    rerank = SentenceTransformerRerank(\n",
    "        top_n=rerank_top_n, model=\"BAAI/bge-reranker-base\"\n",
    "    )\n",
    "\n",
    "    sentence_window_engine = sentence_index.as_query_engine(\n",
    "        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]\n",
    "    )\n",
    "    return sentence_window_engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ea8530f7-3efb-44a9-a276-0f0e03473eda",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ef6e3cf59086483abedac8f983150f8c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bf6655967daa40018d87a7209f67d143",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0afeccbaa77a497e98940d20e76995fd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "72825989c8694746bc747fb34e6cb544",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "387bf2d2f4af44b996a2493e8adb3f29",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b85c5c5cefca4ff29bd8a70f5a1dd332",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9fd62f244c3745c8a49db10a7ba3eb44",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "28ff92d8858b48979c6c59e729a7f212",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "35c0664222ba4ac2880783e8d14bae87",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7567c734ec104777ac4a87745aab9b92",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "56fd181e66d4437c8929fbeaa633604a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fab393411b744dbca2c9b2d397a14ab4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "sentence_index = build_sentence_window_index(\n",
    "    document,\n",
    "    llm,\n",
    "    embed_model=\"local:BAAI/bge-small-en-v1.5\",\n",
    "    vector_store=vector_store\n",
    ")\n",
    "\n",
    "query_engine = get_sentence_window_query_engine(sentence_index=sentence_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "db479595-8f96-486a-9215-c41b43b63e5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The article is about a mysterious woman named Vida Winter who wants to reveal the truth about herself to the person reading the letter. The author reflects on the power of words and how they can captivate and affect a person.\n"
     ]
    }
   ],
   "source": [
    "resp = query_engine.query(\n",
    "    \"what is the article about\"\n",
    ")\n",
    "print(resp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "7f7a1ec9-79a2-4196-aecc-a99acc98413b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No, you did not tell him the truth.\n"
     ]
    }
   ],
   "source": [
    "resp = query_engine.query(\n",
    "    \"Did I tell him the truth\"\n",
    ")\n",
    "print(resp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5a586653-8132-4d03-806f-6e530b00589f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The duration of time that the person sat on the stairs after reading the letter is unknown.\n"
     ]
    }
   ],
   "source": [
    "resp = query_engine.query(\n",
    "    \"How long did I sit on the stairs after reading the letter?\"\n",
    ")\n",
    "print(resp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7b0d47d4-a35a-4d1c-bf3b-0025d1b2408e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You knew very little about Vida Winter.\n"
     ]
    }
   ],
   "source": [
    "resp = query_engine.query(\n",
    "    \"How much did I know about Vida winter?\"\n",
    ")\n",
    "print(resp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59cccd86-502e-4447-a25b-9c5b2683b298",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_3.11",
   "language": "python",
   "name": "python_3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
